Praktikum Deep Learning

Optimierung Teil 2

Beispiel anhand MNIST

Die MNIST-Datenbank (Modified National Institute of Standards and Technology database) ist eine öffentlich verfügbare Datenbank von handgeschriebenen Ziffern. Die MNIST-Datenbank besteht aus 60.000 Beispielen (28x28 grayscale Bilder) im Trainingsdatensatz und 10.000 Beispielen im Testdatensatz.

MNIST Beispiele

Wir werden ein einfaches neuronales Netzwerk mit 2 Hidden-Schichten (50 bzw. 500 Knoten) bilden und uns die Loss-Landschaft für 2 Gewichte im letzten Layer anschauen. Durch die Überwachung dieser 2 Gewichte lassen sich wieder die qualitativen Unterschiede zwischen den Algorithmen zeigen. [Abbildung: structure]

Lasst uns anfangen. Wir importieren alle benötigten Module und bereiten die Daten vor.

In [10]:
import numpy as np
from keras.datasets import mnist
from sklearn.preprocessing import OneHotEncoder
from scipy.special import expit
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import tools
import plotly.offline

plotly.offline.init_notebook_mode(connected=True)

Die Eingabedaten sind 60.000 Graustufenbilder der Größe 28x28, d.h. die X-Werte sind Zahlen zwischen 0 und 255. Zur Normalisierung reicht es, sie durch 255 zu teilen. Außerdem strecken wir jede 28x28-Matrix zu einem 1x784-Vektor.

In [11]:
# One-hot encoder for the label vector; the categories are derived from the data.
oh = OneHotEncoder(categories="auto")
In [12]:
# Load MNIST; only the training split is preprocessed here.
(train_X, train_y), (test_X, test_y) = mnist.load_data()
# Flatten every image into a 784-element row and scale the pixel values by 1/255.
train_X = train_X.reshape(-1, 784) / 255
# The encoder is fed the labels as a single column; .toarray() turns its
# result into a dense array.
label_column = np.reshape(train_y, (-1, 1))
train_y_oh = oh.fit_transform(label_column).toarray()

Als Aktivierungsfunktion wird die Sigmoid-Funktion benutzt: $S(x)=\frac{1}{1+e^{-x}}$

In [13]:
hidden_0 = 50  # number of units in the first hidden layer
hidden_1 = 500  # number of units in the second hidden layer


# Cost function:
def costs(x, y, w_a, w_b, seed_):
    """Return the squared-error cost of the seeded random network for (w_a, w_b).

    All weight matrices are redrawn from ``np.random.randn`` after seeding the
    global NumPy RNG with ``seed_``; only ``w2[5, 250]`` and ``w2[5, 251]`` are
    overwritten with the supplied values, so the result is a function of those
    two weights alone.

    Args:
        x: Input samples, one row per sample with 784 features.
        y: Targets laid out as (10, number of samples), i.e. already
            transposed relative to ``x``.
        w_a: Value for ``w2[5, 250]``. A plain number or any array holding
            exactly one element.
        w_b: Value for ``w2[5, 251]``; same accepted forms as ``w_a``.
        seed_: Seed passed to ``np.random.seed``.

    Returns:
        Mean over the samples of the per-sample summed squared error.

    Raises:
        ValueError: If ``w_a`` or ``w_b`` holds more than one element.
    """
    np.random.seed(seed_)  # NOTE: reseeds the global NumPy RNG on every call
    w0 = np.random.randn(hidden_0, 784)  # weight matrix of 1st hidden layer
    w1 = np.random.randn(hidden_1, hidden_0)  # weight matrix of 2nd hidden layer
    w2 = np.random.randn(10, hidden_1)  # weight matrix of output layer
    # Callers hand in 1x1 arrays. Assigning such an array to a single element
    # relies on an implicit array-to-scalar conversion that NumPy deprecated,
    # so the scalar is extracted explicitly.
    w2[5, 250] = np.asarray(w_a).item()  # weight w_250,5(2)
    w2[5, 251] = np.asarray(w_b).item()  # weight w_251,5(2)
    a0 = expit(w0 @ x.T)  # output of 1st hidden layer
    a1 = expit(w1 @ a0)  # output of 2nd hidden layer
    pred = expit(w2 @ a1)  # output of final layer
    return np.mean(np.sum((y - pred) ** 2, axis=0))
In [14]:
# Grid of candidate values for the two tracked weights.
m1s = np.linspace(-15, 17, 40)
m2s = np.linspace(-15, 18, 40)
M1, M2 = np.meshgrid(m1s, m2s)


def _loss_grid(n_samples, seed=135):
    """Evaluate ``costs`` at every (M1, M2) grid point on the first ``n_samples`` rows.

    Returns a flat array with one cost per grid point, in ``np.ravel`` order.
    """
    x = train_X[0:n_samples]
    y = train_y_oh[0:n_samples].T  # costs() subtracts predictions laid out as (classes, samples)
    # Plain floats are passed instead of 1x1 arrays: storing a 1x1 array into
    # a single matrix element relies on a conversion that NumPy has deprecated.
    return np.array([costs(x, y, float(w_a), float(w_b), seed)
                     for w_a, w_b in zip(np.ravel(M1), np.ravel(M2))])


# Costs on the grid for the two sample sizes.
zs_100 = _loss_grid(100)
Z_100 = zs_100.reshape(M1.shape)  # z values for N = 100

zs_1000 = _loss_grid(1000)
Z_1000 = zs_1000.reshape(M1.shape)  # z values for N = 1,000
# N = 10,000 is possible as well but takes considerably longer.

Plot der Loss-Landschaft für unterschiedliche Anzahlen von Samples:

In [15]:
def cam_change(layout, camera):
    """Copy a changed camera onto the second 3D scene of ``fig_widget``.

    Registered below as the change callback for ``scene1``'s ``camera``
    property, so moving the left plot moves the right one too.
    ``layout`` is accepted because the callback is invoked with it, but it is
    not used.
    """
    fig_widget.layout.scene2.camera = camera


# One surface trace per sample size, each bound to its own 3D scene.
fig_100 = dict(type='surface', x=M1, y=M2, z=Z_100, scene='scene1')
fig_1000 = dict(type='surface', x=M1, y=M2, z=Z_1000, scene='scene2')
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "surface"}, {"type": "surface"}]],
                    subplot_titles=("N:100", "N:1000"))
# add_trace(..., row=, col=) replaces the deprecated append_trace.
fig.add_trace(fig_100, row=1, col=1)
fig.add_trace(fig_1000, row=1, col=2)
fig.update_traces(contours_z=dict(show=True, usecolormap=True,
                                  highlightcolor="limegreen", project_z=True))

fig_widget = go.FigureWidget(fig)
# Same starting viewpoint for both scenes.
fig_widget.layout.scene1.camera.eye = dict(x=2, y=-2, z=2)
fig_widget.layout.scene2.camera.eye = dict(x=2, y=-2, z=2)
fig_widget.layout.scene1.on_change(cam_change, 'camera')
fig_widget.show()

Lasst uns das Netzwerk trainieren, aber nur die Gewichte, die wir überwachen.

In [16]:
# Plot colours per optimiser: [path colour, marker colour].
mcolors = {
    "SGD": ["red", "darkred"],
    "SGD-Momentum": ["green", "darkgreen"],
    "Nesterov-SGD": ["blue", "darkblue"],
    "RMSProp": ["yellow", "yellow"],
    "Adam": ["pink", "pink"],
}
# Optimiser names in run order (used later for the output); identical to the
# key order of mcolors.
methods = list(mcolors)

# Logs for the two tracked weights and the cost.
# NOTE: `costs` rebinds the name of the cost function defined earlier in this
# file, so that function is no longer reachable once this cell has run.
weights_2_5_250 = []
weights_2_5_251 = []
costs = []

seed_ = 135  # random seed
N = 100  # sample size
pflag = False  # set to True to print training progress

# Neural network in which only two output-layer weights are trained.
class NeuralNetwork(object):
    """Sigmoid network with two hidden layers that trains only two weights of ``w2``.

    The class reads the module-level names ``seed_``, ``hidden_0``,
    ``hidden_1``, ``start_a`` and ``start_b`` when it is constructed, and
    ``SGD_train`` appends one value per call to the module-level list
    ``costs``.
    """

    # (row, column) positions in ``w2`` of the two weights being trained.
    _TRACKED = ((5, 250), (5, 251))
    # Optimiser names accepted by ``SGD_train``.
    _METHODS = ('SGD', 'SGD-Momentum', 'Nesterov-SGD', 'RMSProp', 'Adam')

    def __init__(self, lr=0.01):
        """Draw the seeded random weights and reset all optimiser state.

        Args:
            lr: Learning rate used by every update rule.
        """
        self.lr = lr
        np.random.seed(seed_)  # same seed for every instance -> identical initial weights
        # Initialise the weight matrices.
        self.w0 = np.random.randn(hidden_0, 784)
        self.w1 = np.random.randn(hidden_1, hidden_0)
        self.w2 = np.random.randn(10, hidden_1)
        self.w2[5][250] = start_a  # starting value for w_a
        self.w2[5][251] = start_b  # starting value for w_b
        # State for Momentum, Nesterov and RMSProp. The buffers take their
        # shape from w2 instead of a hard-coded (10, 500).
        self.rho = 0.9
        self.vx = np.zeros_like(self.w2)
        self.grad_squared = np.zeros_like(self.w2)
        # State for Adam.
        self.counter = 0
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.first_moment = 0
        self.second_moment = 0
        self.first_unbias = 0
        self.second_unbias = 0

    def evaluate_gradient(self, a1, X, y, pred):
        """Return the gradient of the cost with respect to ``w2``.

        The constant factor 2 from differentiating the squared error in
        ``cost`` is left out; it only rescales the effective learning rate.

        Args:
            a1: Activations of the second hidden layer, one column per sample.
            X: Input batch; only its length is used.
            y: Targets, one row per sample.
            pred: Network output, one column per sample.
        """
        temp = (pred - y.T) * pred * (1 - pred) @ a1.T / len(X)
        return temp

    def SGD_train(self, X, y, method):
        """Run one full-batch step that updates only the two tracked weights.

        The cost of the prediction made BEFORE the update is appended to the
        module-level list ``costs``.

        Args:
            X: Input batch, one row per sample.
            y: One-hot targets, one row per sample.
            method: One of 'SGD', 'SGD-Momentum', 'Nesterov-SGD', 'RMSProp',
                'Adam'.

        Raises:
            ValueError: If ``method`` is not one of the supported names. A
                silently skipped step would log no cost and leave the cost
                and weight logs with different lengths.
        """
        if method not in self._METHODS:
            raise ValueError("Unknown method %r; expected one of: %s"
                             % (method, ", ".join(self._METHODS)))
        # Forward pass.
        a0 = expit(self.w0 @ X.T)
        a1 = expit(self.w1 @ a0)
        pred = expit(self.w2 @ a1)
        # Partial derivatives of the cost with respect to the output-layer weights.
        dw2 = self.evaluate_gradient(a1, X, y, pred)
        w2 = self.w2
        # Update ONLY the tracked weights.
        if method == 'SGD':
            for r, c in self._TRACKED:
                w2[r, c] = w2[r, c] - self.lr * dw2[r, c]
        elif method == 'SGD-Momentum':
            self.vx = self.rho * self.vx + dw2
            for r, c in self._TRACKED:
                w2[r, c] = w2[r, c] - self.lr * self.vx[r, c]
        elif method == 'Nesterov-SGD':
            old_vx = self.vx  # not aliased: the next line rebinds self.vx to a new array
            self.vx = self.rho * self.vx - self.lr * dw2
            for r, c in self._TRACKED:
                w2[r, c] = (w2[r, c] + self.vx[r, c]
                            + self.rho * (self.vx[r, c] - old_vx[r, c]))
        elif method == 'RMSProp':
            self.grad_squared = (self.rho * self.grad_squared
                                 + (1 - self.rho) * dw2 * dw2)
            for r, c in self._TRACKED:
                w2[r, c] = w2[r, c] - self.lr * dw2[r, c] / (
                    np.sqrt(self.grad_squared[r, c]) + 1e-1)
        else:  # 'Adam'
            self.first_moment = self.beta1 * self.first_moment + (1 - self.beta1) * dw2
            self.second_moment = self.beta2 * self.second_moment + (1 - self.beta2) * dw2 * dw2
            self.counter += 1
            # Bias correction with the 1-based step count.
            self.first_unbias = self.first_moment / (1 - self.beta1 ** self.counter)
            self.second_unbias = self.second_moment / (1 - self.beta2 ** self.counter)
            for r, c in self._TRACKED:
                w2[r, c] = w2[r, c] - self.lr * self.first_unbias[r, c] / (
                    np.sqrt(self.second_unbias[r, c]) + 1e-1)
        costs.append(self.cost(pred, y))  # log the cost of this step

    def cost(self, pred, y):
        """Return the mean over samples of the per-sample summed squared error."""
        return np.mean(np.sum((y.T - pred) ** 2, axis=0))
    
# Starting values for (w_a, w_b).
starting_points = [(-9, 15)]  # ,(-10.1,15),(-11,15)]
epochs = 1000  # 1,000 epochs
# Progress is reported roughly 20 times per run; max(1, ...) keeps the
# modulus non-zero when epochs < 20.
progress_every = max(1, epochs // 20)
# NeuralNetwork.__init__ reads these two module-level names.
start_a, start_b = starting_points[0]
for method in methods:
    print('Method: ', method)
    model = NeuralNetwork(10)  # learning rate 10
    for i in range(epochs):
        # Log the weights BEFORE the step: SGD_train logs the cost of the
        # prediction made with these pre-update weights, so entry i of the
        # weight logs and entry i of the cost log describe the same point.
        weights_2_5_250.append(model.w2[5][250])
        weights_2_5_251.append(model.w2[5][251])
        model.SGD_train(train_X[0:N], train_y_oh[0:N], method)
        if pflag and i % progress_every == 0:
            print("Epoch ", i, " of ", epochs, ".")

print("Fertig!")

# Split the flat logs into one array per optimiser, in `methods` order.
costs = np.split(np.array(costs), len(methods))
weights_2_5_250 = np.split(np.array(weights_2_5_250), len(methods))
weights_2_5_251 = np.split(np.array(weights_2_5_251), len(methods))
Method:  SGD
Method:  SGD-Momentum
Method:  Nesterov-SGD
Method:  RMSProp
Method:  Adam
Fertig!

3D Ansicht

In [17]:
# Epochs at which an animation frame is drawn.
lower_bound = int(epochs / 500)
p1 = list(np.arange(0, lower_bound, 20))  # unevenly spaced alternative, first part
p2 = list(np.arange(lower_bound, epochs, 100))  # unevenly spaced alternative, second part
p3 = list(np.arange(0, epochs, 20))  # evenly spaced
# points_ = p1 + p2
points_ = p3
counter = 0
scaler = 1.001  # factor applied to the cost values of the drawn paths
# Plotly figure assembled as a plain dict.
fig_dict = {"data": [], "layout": {}, "frames": []}

fig_dict["data"].append(fig_100)  # trace 0: the loss surface
for name, w_a_path, w_b_path, cost_path in zip(
        methods, weights_2_5_250, weights_2_5_251, costs):
    # Every path is added twice, giving two trace slots per method.
    path_kwargs = dict(
        x=w_a_path[0:epochs],
        y=w_b_path[0:epochs],
        z=cost_path[0:epochs] * scaler,  # TODO add some value to emphasize
        line=dict(color=mcolors[name][0], width=5),
        name=name,
        mode='lines',
    )
    fig_dict["data"].append(go.Scatter3d(**path_kwargs))
    fig_dict["data"].append(go.Scatter3d(showlegend=True, **path_kwargs))
# Play / pause buttons that drive the frame animation.
_play_args = [None, {"frame": {"duration": 500, "redraw": True},
                     "fromcurrent": True,
                     "transition": {"duration": 300,
                                    "easing": "quadratic-in-out"}}]
_pause_args = [[None], {"frame": {"duration": 0, "redraw": True},
                        "mode": "immediate",
                        "transition": {"duration": 0}}]
fig_dict["layout"]["updatemenus"] = [dict(
    buttons=[
        dict(args=_play_args, label="Play", method="animate"),
        dict(args=_pause_args, label="Pause", method="animate"),
    ],
    direction="left",
    pad=dict(r=10, t=87),
    showactive=False,
    type="buttons",
    x=0.1,
    xanchor="right",
    y=0,
    yanchor="top",
)]

# Epoch slider; its steps are filled in while the frames are built.
sliders_dict = dict(
    active=0,
    yanchor="top",
    xanchor="left",
    currentvalue=dict(
        font=dict(size=20),
        prefix="Epoch:",
        visible=True,
        xanchor="right",
    ),
    transition=dict(duration=300, easing="cubic-in-out"),
    pad=dict(b=10, t=50),
    len=0.9,
    x=0.1,
    y=0,
    steps=[],
)

# Surface description of the N = 100 loss landscape.
data_dict = dict(x=M1, y=M2, z=Z_100, type="surface")


# Build one animation frame and one slider step per selected epoch.
# Every frame carries the surface plus, per method, one marker and one path.
# That matches the figure's initial traces (the surface at index 0 followed by
# two traces per method), so the frame addresses 1 + 2 * len(methods) traces.
tlist = list(range(1 + 2 * len(methods)))
for epoch in points_:
    fdata = [dict(type='surface', x=M1, y=M2, z=Z_100, name='Loss Landschaft')]
    for j, method in enumerate(methods):
        path_color, marker_color = mcolors[method]
        # Marker at the position reached at this epoch.
        position_marker = go.Scatter3d(
            x=[weights_2_5_250[j][epoch]],
            y=[weights_2_5_251[j][epoch]],
            z=[costs[j][epoch] * scaler],
            name=method + ' trace',
            mode="markers", marker=dict(color=marker_color, size=10))
        # Path up to AND including this epoch, so the line ends at the marker
        # and the frame for epoch 0 is not an empty trace.
        path_so_far = go.Scatter3d(
            x=weights_2_5_250[j][0:epoch + 1],
            y=weights_2_5_251[j][0:epoch + 1],
            z=costs[j][0:epoch + 1] * scaler,
            line=dict(color=path_color, width=5),
            name=method, mode='lines')
        fdata.append(position_marker)
        fdata.append(path_so_far)
    frame = go.Frame(data=fdata, traces=tlist, name=str(epoch))

    fig_dict["frames"].append(frame)

    # Slider step that jumps to the frame named after this epoch.
    slider_step = {"args": [
        [str(epoch)],
        {"frame": {"duration": 300, "redraw": True},
         "mode": "immediate",
         "transition": {"duration": 1}}
    ],
        "label": str(epoch),
        "method": "animate"}
    sliders_dict["steps"].append(slider_step)

fig_dict["layout"]["sliders"] = [sliders_dict]

# Turn the assembled dict into a figure.
fig = go.Figure(fig_dict)

# Contour lines on the surface trace(s) only.
surface_contours = dict(show=True, usecolormap=True,
                        highlightcolor="limegreen", project_z=True)
fig.update_traces(contours_z=surface_contours, selector=dict(type='surface'))

# Title, camera position, figure size, margins and legend placement.
legend_box = dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
fig.update_layout(
    title='Loss landschaft und Trajektorien',  # autosize=False,
    scene_camera_eye=dict(x=2, y=-2, z=2),
    width=800,
    height=800,
    margin=dict(l=15, r=20, b=15, t=60),
    legend=legend_box,
)

# Fix the x and y extents of the scene; the z range is left automatic.
fig.update_layout(scene=dict(xaxis=dict(range=[-15, 17]),
                             yaxis=dict(range=[-15, 18])))
# fig.layout.scene.zaxis.range = [-10, 10]
fig.show()

Anmerkung: Die Linien sind absichtlich höher dargestellt.

Contour Plot

In [18]:
# Epochs at which an animation frame is drawn.
p3 = list(np.arange(0, epochs, 20))
points_ = p3

# Contour of the N = 100 loss landscape over the two tracked weights.
cont_fig = go.Contour(x=m1s, y=m2s, z=Z_100, connectgaps=True,
                      colorscale='hsv', showscale=False)

fig = make_subplots(rows=2, cols=1,
                    subplot_titles=("Contour Plot", "Loss Graph"))

# add_trace(..., row=, col=) replaces the deprecated append_trace.
fig.add_trace(cont_fig, row=1, col=1)  # trace 0

for i, method in enumerate(methods):
    path_color, marker_color = mcolors[method]
    # Cost per epoch, drawn in the lower subplot.
    graph_fig = go.Scatter(y=costs[i], name=method,
                           showlegend=False, line=dict(color=path_color))
    # Full weight path on the contour plot.
    marker_line_fig = go.Scatter(
        x=weights_2_5_250[i],
        y=weights_2_5_251[i],
        name=method,
        marker=dict(color=path_color, size=7))
    # Marker at the first logged position.
    marker_fig = go.Scatter(
        x=[weights_2_5_250[i][0]],
        y=[weights_2_5_251[i][0]],
        name=method + ' trace',
        mode="markers", marker=dict(color=marker_color, size=10))
    fig.add_trace(graph_fig, row=2, col=1)  # trace 1 + 3*i
    fig.add_trace(marker_line_fig, row=1, col=1)  # trace 2 + 3*i
    fig.add_trace(marker_fig, row=1, col=1)  # trace 3 + 3*i

fig['layout']['height'] = 1000

# Update xaxis properties
fig.update_xaxes(title_text="w2", range=[-15, 17], row=1, col=1)
fig.update_xaxes(title_text="Epochs", row=2, col=1)


# Update yaxis properties
fig.update_yaxes(title_text="w1", range=[-15, 18], row=1, col=1)
fig.update_yaxes(title_text="Costs", row=2, col=1)

# Play / pause buttons that drive the frame animation.
_play_args = [None, {"frame": {"duration": 500, "redraw": True},
                     "fromcurrent": True,
                     "transition": {"duration": 300,
                                    "easing": "quadratic-in-out"}}]
_pause_args = [[None], {"frame": {"duration": 0, "redraw": True},
                        "mode": "immediate",
                        "transition": {"duration": 0}}]
fig["layout"]["updatemenus"] = [dict(
    buttons=[
        dict(args=_play_args, label="Play", method="animate"),
        dict(args=_pause_args, label="Pause", method="animate"),
    ],
    direction="left",
    pad=dict(r=10, t=87),
    showactive=False,
    type="buttons",
    x=0.1,
    xanchor="right",
    y=0,
    yanchor="top",
)]

# Epoch slider; its steps are filled in while the frames are built.
sliders_dict = dict(
    active=0,
    yanchor="top",
    xanchor="left",
    currentvalue=dict(
        font=dict(size=20),
        prefix="Epoch:",
        visible=True,
        xanchor="right",
    ),
    transition=dict(duration=300, easing="cubic-in-out"),
    pad=dict(b=10, t=50),
    len=0.9,
    x=0.1,
    y=0,
    steps=[],
)

# Build one animation frame and one slider step per selected epoch.
# Frame data order mirrors the figure's traces: the contour first, then per
# method the loss curve, the weight path and the position marker.
frame_list = []
tlist = list(range(1 + 3 * len(methods)))
for epoch in points_:
    fdata = [go.Contour(x=m1s, y=m2s, z=Z_100, connectgaps=True,
                        colorscale='hsv', showscale=False)]
    upto = epoch + 1  # include the current epoch so the lines reach the marker
    for j, method in enumerate(methods):
        path_color, marker_color = mcolors[method]
        # Loss curve up to and including this epoch.
        loss_curve = go.Scatter(y=costs[j][0:upto], name=method,
                                showlegend=False, line=dict(color=path_color))
        # Weight path up to and including this epoch.
        path_so_far = go.Scatter(x=weights_2_5_250[j][0:upto],
                                 y=weights_2_5_251[j][0:upto],
                                 line=dict(color=path_color, width=5),
                                 name=method, mode='lines')
        # Marker at the position reached at this epoch.
        position_marker = go.Scatter(
            x=[weights_2_5_250[j][epoch]],
            y=[weights_2_5_251[j][epoch]],
            name=method + ' trace',
            mode="markers", marker=dict(color=marker_color, size=10))
        fdata.append(loss_curve)
        fdata.append(path_so_far)
        fdata.append(position_marker)

    frame = go.Frame(data=fdata, traces=tlist, name=str(epoch))

    frame_list.append(frame)

    # Slider step that jumps to the frame named after this epoch.
    slider_step = {"args": [
        [str(epoch)],
        {"frame": {"duration": 300, "redraw": True},
         "mode": "immediate",
         "transition": {"duration": 1}}
    ],
        "label": str(epoch),
        "method": "animate"}
    sliders_dict["steps"].append(slider_step)
fig.frames = frame_list
fig["layout"]["sliders"] = [sliders_dict]

fig.show()